This notebook shows how to optimise a TensorFlow exported SavedModel by shrinking its size (to reduce its memory and disk footprint) and by improving prediction latency. This can be accomplished by applying the following:
- Removing unused and pass-through (Identity) nodes.
- Folding constants and batch normalisation operations.
- Merging duplicate nodes.
- Optionally quantising the weights (left commented out in this example).
The optimisation operations we apply in this example come from the TensorFlow Graph Transform Tool, which is a C++ command-line tool. We use its Python APIs to call the C++ libraries.
The Graph Transform Tool is designed to work on models that are saved as GraphDef files, usually in binary protobuf format. However, the model exported after training an estimator is in the SavedModel format (a saved_model.pb file plus a variables folder with variables.data-* and variables.index files).
We need to optimise the model while keeping it in the SavedModel format. Thus, the optimisation steps will be:
1. Freeze the exported SavedModel: convert the checkpoint variables into constants in a single GraphDef file.
2. Optimise the frozen graph by applying the Graph Transform Tool transforms.
3. Convert the optimised GraphDef back into a SavedModel that can be served.
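As a quick preview, here is a minimal sketch of how the Graph Transform Tool is driven from Python. The file name 'frozen_model.pb' and the node names are placeholders for illustration; the notebook builds the real values step by step in the cells below.
import tensorflow as tf
from tensorflow.tools.graph_transforms import TransformGraph

# Load a frozen GraphDef from disk (placeholder path for illustration).
graph_def = tf.GraphDef()
with tf.gfile.GFile('frozen_model.pb', 'rb') as f:
    graph_def.ParseFromString(f.read())

# Rewrite the graph; only the nodes needed to compute the listed
# output nodes survive the transforms.
optimised_graph_def = TransformGraph(
    graph_def,
    [],                     # input node names
    ['softmax/Softmax'],    # output node names
    ['remove_nodes(op=Identity)',
     'fold_constants(ignore_errors=true)',
     'strip_unused_nodes']
)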
In [1]:
import os
import numpy as np
from datetime import datetime
import tensorflow as tf
print "TensorFlow : {}".format(tf.__version__)
In [2]:
(train_data, train_labels), (eval_data, eval_labels) = tf.keras.datasets.mnist.load_data()
NUM_CLASSES = 10
In [3]:
print "Train data shape: {}".format(train_data.shape)
print "Eval data shape: {}".format(eval_data.shape)
In [4]:
def keras_model_fn(params):
inputs = tf.keras.layers.Input(shape=(28, 28), name='input_image')
input_layer = tf.keras.layers.Reshape(target_shape=(28, 28, 1), name='reshape')(inputs)
# convolutional layers
conv_inputs = input_layer
for i in range(params.num_conv_layers):
filters = params.init_filters * (2**i)
conv = tf.keras.layers.Conv2D(kernel_size=3, filters=filters, strides=1, padding='SAME', activation='relu')(conv_inputs)
max_pool = tf.keras.layers.MaxPool2D(pool_size=2, strides=2, padding='SAME')(conv)
batch_norm = tf.keras.layers.BatchNormalization()(max_pool)
conv_inputs = batch_norm
flatten = tf.keras.layers.Flatten(name='flatten')(conv_inputs)
# fully-connected layers
dense_inputs = flatten
for i in range(len(params.hidden_units)):
dense = tf.keras.layers.Dense(units=params.hidden_units[i], activation='relu')(dense_inputs)
dropout = tf.keras.layers.Dropout(params.dropout)(dense)
dense_inputs = dropout
# softmax classifier
logits = tf.keras.layers.Dense(units=NUM_CLASSES, name='logits')(dense_inputs)
softmax = tf.keras.layers.Activation('softmax', name='softmax')(logits)
# keras model
model = tf.keras.models.Model(inputs, softmax)
return model
In [5]:
def create_estimator(params, run_config):
    keras_model = keras_model_fn(params)
    keras_model.summary()
    optimizer = tf.keras.optimizers.Adam(lr=params.learning_rate)
    keras_model.compile(loss='sparse_categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    mnist_classifier = tf.keras.estimator.model_to_estimator(
        keras_model=keras_model,
        config=run_config
    )
    return mnist_classifier
In [6]:
def run_experiment(params, run_config):
train_spec = tf.estimator.TrainSpec(
input_fn = tf.estimator.inputs.numpy_input_fn(
x={"input_image": train_data},
y=train_labels,
batch_size=params.batch_size,
num_epochs=None,
shuffle=True),
        max_steps=params.max_training_steps
)
eval_spec = tf.estimator.EvalSpec(
input_fn = tf.estimator.inputs.numpy_input_fn(
x={"input_image": eval_data},
y=eval_labels,
batch_size=params.batch_size,
num_epochs=1,
shuffle=False),
steps=None,
throttle_secs=params.eval_throttle_secs
)
tf.logging.set_verbosity(tf.logging.INFO)
time_start = datetime.utcnow()
print("Experiment started at {}".format(time_start.strftime("%H:%M:%S")))
print(".......................................")
estimator = create_estimator(params, run_config)
tf.estimator.train_and_evaluate(
estimator=estimator,
train_spec=train_spec,
eval_spec=eval_spec
)
time_end = datetime.utcnow()
print(".......................................")
print("Experiment finished at {}".format(time_end.strftime("%H:%M:%S")))
print("")
time_elapsed = time_end - time_start
print("Experiment elapsed time: {} seconds".format(time_elapsed.total_seconds()))
return estimator
In [7]:
MODELS_LOCATION = 'models/mnist'
MODEL_NAME = 'keras_classifier'
model_dir = os.path.join(MODELS_LOCATION, MODEL_NAME)
print(model_dir)
params = tf.contrib.training.HParams(
batch_size=100,
hidden_units=[512, 512],
num_conv_layers=3,
init_filters=64,
dropout=0.2,
    max_training_steps=50,
eval_throttle_secs=10,
learning_rate=1e-3,
debug=True
)
run_config = tf.estimator.RunConfig(
tf_random_seed=19830610,
save_checkpoints_steps=1000,
keep_checkpoint_max=3,
model_dir=model_dir
)
In [8]:
if tf.gfile.Exists(model_dir):
print("Removing previous artifacts...")
tf.gfile.DeleteRecursively(model_dir)
os.makedirs(model_dir)
estimator = run_experiment(params, run_config)
In [9]:
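# The raw serving input receiver exposes a single float32 placeholder of shape
# [None, 28, 28]; the dict key ('input_image') becomes the signature input name,
# while the placeholder op itself is named 'serving_input_image'.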
def make_serving_input_receiver_fn():
inputs = {'input_image': tf.placeholder(shape=[None,28,28], dtype=tf.float32, name='serving_input_image')}
return tf.estimator.export.build_raw_serving_input_receiver_fn(inputs)
export_dir = os.path.join(model_dir, 'export')
if tf.gfile.Exists(export_dir):
tf.gfile.DeleteRecursively(export_dir)
estimator.export_savedmodel(
export_dir_base=export_dir,
serving_input_receiver_fn=make_serving_input_receiver_fn()
)
In [10]:
%%bash
saved_models_base=models/mnist/keras_classifier/export/
saved_model_dir=${saved_models_base}$(ls ${saved_models_base} | tail -n 1)
echo ${saved_model_dir}
ls ${saved_model_dir}
saved_model_cli show --dir=${saved_model_dir} --all
In [33]:
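# tf.contrib.predictor.from_saved_model loads the SavedModel into a session once and
# returns a callable; the feed dict passed to that callable is keyed by the input
# names of the chosen signature.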
def inference_test(saved_model_dir, signature="serving_default", input_name='input_image', batch=300, repeat=100):
    tf.logging.set_verbosity(tf.logging.ERROR)
    time_start = datetime.utcnow()
    predictor = tf.contrib.predictor.from_saved_model(
        export_dir=saved_model_dir,
        signature_def_key=signature
    )
    time_end = datetime.utcnow()
    time_elapsed = time_end - time_start
    print("")
    print("Model loading time: {} seconds".format(time_elapsed.total_seconds()))
    print("")
    time_start = datetime.utcnow()
    output = None
    for i in range(repeat):
        predictions = predictor(
            {
                input_name: eval_data[:batch]
            }
        )
        output = [np.argmax(prediction) for prediction in predictions['softmax']]
    time_end = datetime.utcnow()
    time_elapsed_sec = (time_end - time_start).total_seconds()
    print("Inference elapsed time: {} seconds".format(time_elapsed_sec))
    print("")
    print("Prediction produced for a batch of {} instances, repeated {} times".format(len(output), repeat))
    print("Average latency per batch: {} seconds".format(time_elapsed_sec/repeat))
    print("")
In [34]:
saved_model_dir = os.path.join(
    export_dir, sorted([f for f in os.listdir(export_dir) if f.isdigit()])[-1])  # use the latest timestamped export
print(saved_model_dir)
inference_test(saved_model_dir)
In [13]:
def describe_graph(graph_def, show_nodes=False):
    print('Input Feature Nodes: {}'.format([node.name for node in graph_def.node if node.op=='Placeholder']))
    print("")
    print('Unused Nodes: {}'.format([node.name for node in graph_def.node if 'unused' in node.name]))
    print("")
    print('Output Nodes: {}'.format([node.name for node in graph_def.node if 'softmax' in node.name]))
    print("")
    print('Quantization Nodes: {}'.format([node.name for node in graph_def.node if 'quant' in node.name]))
    print("")
    print('Constant Count: {}'.format(len([node for node in graph_def.node if node.op=='Const'])))
    print("")
    print('Variable Count: {}'.format(len([node for node in graph_def.node if 'Variable' in node.op])))
    print("")
    print('Identity Count: {}'.format(len([node for node in graph_def.node if node.op=='Identity'])))
    print("")
    print('Total nodes: {}'.format(len(graph_def.node)))
    print('')
    if show_nodes:
        for node in graph_def.node:
            print('Op:{} - Name: {}'.format(node.op, node.name))
In [14]:
def get_graph_def_from_saved_model(saved_model_dir):
    print(saved_model_dir)
    print("")
    from tensorflow.python.saved_model import tag_constants
    with tf.Session() as session:
        meta_graph_def = tf.saved_model.loader.load(
            session,
            tags=[tag_constants.SERVING],
            export_dir=saved_model_dir
        )
    return meta_graph_def.graph_def
In [15]:
describe_graph(get_graph_def_from_saved_model(saved_model_dir))
In [16]:
def get_size(model_dir):
    print(model_dir)
    print("")
    pb_size = os.path.getsize(os.path.join(model_dir, 'saved_model.pb'))
    variables_size = 0
    if os.path.exists(os.path.join(model_dir, 'variables/variables.data-00000-of-00001')):
        variables_size = os.path.getsize(os.path.join(model_dir, 'variables/variables.data-00000-of-00001'))
        variables_size += os.path.getsize(os.path.join(model_dir, 'variables/variables.index'))
    print("Model size: {} KB".format(round(pb_size/(1024.0), 3)))
    print("Variables size: {} KB".format(round(variables_size/(1024.0), 3)))
    print("Total Size: {} KB".format(round((pb_size + variables_size)/(1024.0), 3)))
In [17]:
get_size(saved_model_dir)
In [18]:
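# Freezing replaces the variables stored in the checkpoint with Const nodes that hold
# the trained values, producing a single self-contained GraphDef that the Graph
# Transform Tool can operate on.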
def freeze_graph(saved_model_dir):
    from tensorflow.python.tools import freeze_graph
    from tensorflow.python.saved_model import tag_constants
    output_graph_filename = os.path.join(saved_model_dir, "freezed_model.pb")
    output_node_names = "softmax/Softmax"
    initializer_nodes = ""
    freeze_graph.freeze_graph(
        input_saved_model_dir=saved_model_dir,
        output_graph=output_graph_filename,
        saved_model_tags=tag_constants.SERVING,
        output_node_names=output_node_names,
        initializer_nodes=initializer_nodes,
        input_graph=None,
        input_saver=False,
        input_binary=False,
        input_checkpoint=None,
        restore_op_name=None,
        filename_tensor_name=None,
        clear_devices=False,
        input_meta_graph=False,
    )
    print("SavedModel graph frozen!")
In [19]:
freeze_graph(saved_model_dir)
In [20]:
%%bash
saved_models_base=models/mnist/keras_classifier/export/
saved_model_dir=${saved_models_base}$(ls ${saved_models_base} | tail -n 1)
echo ${saved_model_dir}
ls ${saved_model_dir}
In [21]:
def get_graph_def_from_file(graph_filepath):
    print(graph_filepath)
    print("")
    from tensorflow.python.framework import ops
    with ops.Graph().as_default():
        with tf.gfile.GFile(graph_filepath, "rb") as f:
            graph_def = tf.GraphDef()
            graph_def.ParseFromString(f.read())
            return graph_def
In [22]:
freezed_filepath=os.path.join(saved_model_dir,'freezed_model.pb')
describe_graph(get_graph_def_from_file(freezed_filepath))
In [23]:
def optimize_graph(model_dir, graph_filename, transforms):
    from tensorflow.tools.graph_transforms import TransformGraph
    input_names = []
    output_names = ['softmax/Softmax']
    graph_def = get_graph_def_from_file(os.path.join(model_dir, graph_filename))
    optimised_graph_def = TransformGraph(graph_def,
                                         input_names,
                                         output_names,
                                         transforms)
    tf.train.write_graph(optimised_graph_def,
                         logdir=model_dir,
                         as_text=False,
                         name='optimised_model.pb')
    print("Frozen graph optimised!")
In [24]:
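# What each transform does (the quantisation transforms are left commented out below
# because they shrink the weights to eight bits but can affect accuracy, so enable them
# only after re-evaluating the model):
#  - remove_nodes(op=Identity): drops pass-through Identity ops.
#  - fold_constants(ignore_errors=true): pre-computes sub-graphs that depend only on constants.
#  - fold_batch_norms: folds batch-norm multiplications into the preceding convolution
#    weights (run after fold_constants).
#  - merge_duplicate_nodes: de-duplicates nodes that do identical work on the same inputs.
#  - strip_unused_nodes: removes nodes not needed to compute the declared outputs.
#  - sort_by_execution_order: orders nodes so that every node appears after its inputs.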
transforms = [
'remove_nodes(op=Identity)',
'fold_constants(ignore_errors=true)',
'fold_batch_norms',
# 'fuse_resize_pad_and_conv',
# 'quantize_weights',
# 'quantize_nodes',
'merge_duplicate_nodes',
'strip_unused_nodes',
'sort_by_execution_order'
]
optimize_graph(saved_model_dir, 'freezed_model.pb', transforms)
In [25]:
%%bash
saved_models_base=models/mnist/keras_classifier/export/
saved_model_dir=${saved_models_base}$(ls ${saved_models_base} | tail -n 1)
echo ${saved_model_dir}
ls ${saved_model_dir}
In [26]:
optimised_filepath=os.path.join(saved_model_dir,'optimised_model.pb')
describe_graph(get_graph_def_from_file(optimised_filepath))
In [27]:
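# The optimised GraphDef contains no variables (everything is now a constant), so it can
# be re-imported into a fresh graph and written out with simple_save, which builds a
# SavedModel with a 'serving_default' signature from the given input and output tensors.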
def convert_graph_def_to_saved_model(graph_filepath):
    export_dir = os.path.join(saved_model_dir, 'optimised')
    if tf.gfile.Exists(export_dir):
        tf.gfile.DeleteRecursively(export_dir)
    graph_def = get_graph_def_from_file(graph_filepath)
    with tf.Session(graph=tf.Graph()) as session:
        tf.import_graph_def(graph_def, name="")
        tf.saved_model.simple_save(session,
                                   export_dir,
                                   inputs={
                                       node.name: session.graph.get_tensor_by_name("{}:0".format(node.name))
                                       for node in graph_def.node if node.op=='Placeholder'},
                                   outputs={
                                       "softmax": session.graph.get_tensor_by_name("softmax/Softmax:0"),
                                   }
                                   )
    print("Optimised graph converted to SavedModel!")
In [28]:
optimised_filepath=os.path.join(saved_model_dir,'optimised_model.pb')
convert_graph_def_to_saved_model(optimised_filepath)
In [29]:
optimised_saved_model_dir = os.path.join(saved_model_dir,'optimised')
get_size(optimised_saved_model_dir)
In [30]:
%%bash
saved_models_base=models/mnist/keras_classifier/export/
saved_model_dir=${saved_models_base}$(ls ${saved_models_base} | tail -n 1)/optimised
ls ${saved_model_dir}
saved_model_cli show --dir ${saved_model_dir} --all
In [35]:
optimized_saved_model_dir = os.path.join(saved_model_dir,'optimised')
print(optimized_saved_model_dir)
inference_test(saved_model_dir=optimized_saved_model_dir, signature='serving_default', input_name='serving_input_image')
In [ ]:
PROJECT = 'ksalama-gcp-playground'
BUCKET = 'ksalama-gcs-cloudml'
REGION = 'europe-west1'
MODEL_NAME = 'mnist_classifier'
os.environ['BUCKET'] = BUCKET
os.environ['PROJECT'] = PROJECT
os.environ['REGION'] = REGION
os.environ['MODEL_NAME'] = MODEL_NAME
In [ ]:
%%bash
gsutil -m rm -r gs://${BUCKET}/tf-model-optimisation
In [ ]:
%%bash
saved_models_base=models/mnist/keras_classifier/export/
saved_model_dir=${saved_models_base}$(ls ${saved_models_base} | tail -n 1)
echo ${saved_model_dir}
gsutil -m cp -r ${saved_model_dir} gs://${BUCKET}/tf-model-optimisation/original
In [ ]:
%%bash
saved_models_base=models/mnist/keras_classifier/export/
saved_model_dir=${saved_models_base}$(ls ${saved_models_base} | tail -n 1)/optimised
echo ${saved_model_dir}
gsutil -m cp -r ${saved_model_dir} gs://${BUCKET}/tf-model-optimisation
In [ ]:
%%bash
echo ${MODEL_NAME}
gcloud ml-engine models create ${MODEL_NAME} --regions=${REGION}
Version: v_org is the original SavedModel (before optimisation)
In [ ]:
%%bash
MODEL_VERSION='v_org'
MODEL_ORIGIN=gs://${BUCKET}/tf-model-optimisation/original
gcloud ml-engine versions create ${MODEL_VERSION} \
--model=${MODEL_NAME} \
--origin=${MODEL_ORIGIN} \
--runtime-version=1.10
Version: v_opt is the optimised SavedModel (after optimisation)
In [ ]:
%%bash
MODEL_VERSION='v_opt'
MODEL_ORIGIN=gs://${BUCKET}/tf-model-optimisation/optimised
gcloud ml-engine versions create ${MODEL_VERSION} \
--model=${MODEL_NAME} \
--origin=${MODEL_ORIGIN} \
--runtime-version=1.10
In [ ]:
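# Build a Cloud ML Engine (v1) API client using the application-default credentials.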
from googleapiclient import discovery
from oauth2client.client import GoogleCredentials
credentials = GoogleCredentials.get_application_default()
api = discovery.build(
'ml', 'v1',
credentials=credentials,
discoveryServiceUrl='https://storage.googleapis.com/cloud-ml/discovery/ml_v1_discovery.json'
)
def predict(version, instances):
    request_data = {'instances': instances}
    model_url = 'projects/{}/models/{}/versions/{}'.format(PROJECT, MODEL_NAME, version)
    response = api.projects().predict(body=request_data, name=model_url).execute()
    class_ids = None
    try:
        # The model's serving signature outputs a 'softmax' vector; take its argmax as the class id.
        class_ids = [np.argmax(item["softmax"]) for item in response["predictions"]]
    except KeyError:
        print(response)
    return class_ids
In [ ]:
def inference_cmle(version, batch=100, repeat=10):
    instances = [
        {'input_image_3': eval_data[img].astype(float).tolist()}
        for img in range(batch)
    ]
    # warm-up request
    predict(version, [instances[0]])
    print('Warm up request performed!')
    print('Timer started...')
    print('')
    time_start = datetime.utcnow()
    output = None
    for i in range(repeat):
        output = predict(version, instances)
    time_end = datetime.utcnow()
    time_elapsed_sec = (time_end - time_start).total_seconds()
    print("Inference elapsed time: {} seconds".format(time_elapsed_sec))
    print("")
    print("Prediction produced for a batch of {} instances, repeated {} times".format(len(output), repeat))
    print("Average latency per batch: {} seconds".format(time_elapsed_sec/repeat))
    print("")
    print("Prediction output for the last instance: {}".format(output[-1]))
In [ ]:
version='v_org'
inference_cmle(version)
In [ ]:
version='v_opt'
inference_cmle(version)
In [ ]: